Imports
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, auc, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
In [2]:
%matplotlib inline
In [3]:
sns.set()
(a) Original owners: National Institute of Diabetes and Digestive and Kidney Diseases
(b) Donor of database: Vincent Sigillito (vgs@aplcen.apl.jhu.edu), Applied Physics Laboratory, The Johns Hopkins University
(c) Date received: 9 May 1990
Several constraints were placed on the selection of these instances from a larger database; in particular, all patients here are females at least 21 years old of Pima Indian heritage. ADAP, the learning algorithm used in the original study, is an adaptive routine that generates and executes digital analogs of perceptron-like devices; see the original paper for details.
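ADAP itself is not used in this notebook; purely as an illustration of what "perceptron-like" means, the sketch below shows a single perceptron-style weight update. This is an illustrative sketch only, not the ADAP algorithm.
# Illustrative perceptron-style update rule (not ADAP): nudge the weights
# whenever the current weights misclassify an example.
def perceptron_step(w, x, target, lr=0.1):
    prediction = 1 if np.dot(w, x) > 0 else 0
    return w + lr * (target - prediction) * x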
Class | Number of instances |
---|---|
0 | 500 |
1 | 268 |
Attribute | Mean | Standard deviation |
---|---|---|
1. Number of times pregnant | 3.8 | 3.4 |
2. Plasma glucose concentration | 120.9 | 32.0 |
3. Diastolic blood pressure (mm Hg) | 69.1 | 19.4 |
4. Triceps skin fold thickness (mm) | 20.5 | 16.0 |
5. 2-hour serum insulin (mu U/ml) | 79.8 | 115.2 |
6. Body mass index (kg/m^2) | 32.0 | 7.9 |
7. Diabetes pedigree function | 0.5 | 0.3 |
8. Age (years) | 33.2 | 11.8 |
In [4]:
df = pd.read_csv('../data/pima-indians-diabetes-data.csv', index_col=[0])
In [5]:
df.head()
Out[5]:
In [6]:
df.describe()
Out[6]:
Look at class distribution
In [7]:
len(df[df['class'] == 1]), len(df[df['class'] == 0])
Out[7]:
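An equivalent, arguably more idiomatic way to check the class balance (a sketch using pandas' value_counts, not part of the original cells):
# Class counts and proportions taken straight from the label column.
df['class'].value_counts()
df['class'].value_counts(normalize=True)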
In [8]:
sns.pairplot(df, x_vars=['plasma_glucose_c', 'blood_presure', 'BMI'],
y_vars=['plasma_glucose_c', 'blood_presure', 'BMI'], hue='class')
Out[8]:
In [9]:
X = df.drop('class', axis=1).values
y = df['class'].values
In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
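The split above uses scikit-learn's default 75/25 split without stratification. Because the classes are imbalanced (roughly 500 vs. 268), a stratified split keeps the class proportions similar in the train and test sets; a possible variant, not used in this notebook:
# Alternative split (sketch): stratify on the labels; random_state is an
# arbitrary choice added here only for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42)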
Train the model
In [11]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
Out[11]:
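As a quick sanity check on the fitted model (a sketch, not part of the original analysis), RandomForestClassifier exposes impurity-based feature importances that can be matched to the DataFrame columns:
# Rank features by impurity-based importance (assumes clf has been fitted).
feature_names = df.drop('class', axis=1).columns
pd.Series(clf.feature_importances_, index=feature_names).sort_values(ascending=False)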
In [12]:
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
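predict_proba returns one column per class, ordered as in clf.classes_ (here [0, 1]), so column 1 holds the estimated probability of class 1; that is why the cells below slice y_pred_proba.T[1]. An equivalent, perhaps clearer slice:
# Probability of the positive class; identical to y_pred_proba.T[1].
p_pos = y_pred_proba[:, 1]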
Helper functions that facilitate plotting
In [13]:
def plot_hist(y, scores, title, size=(1.5, 1.5)):
    """Plot the score distribution of the positive (blue) and negative (red) class."""
    fig = plt.figure(figsize=size, dpi=80)
    axes = fig.add_axes([0, 0, 1, 1])
    bins = np.linspace(0, 1, 11)
    axes.hist([s for s, label in zip(scores, y) if label == 1], bins, alpha=0.5, color='b')
    axes.hist([s for s, label in zip(scores, y) if label == 0], bins, alpha=0.5, color='r')
    # Mark the default 0.5 decision threshold
    axes.vlines(0.5, 0, np.histogram(scores, bins)[0].max(), color='black', linestyles='--')
    axes.set_ylim((0, np.histogram(scores, bins)[0].max()))
    axes.set_xlabel(title)
    axes.set_ylabel('#')
    return fig

def plot_ROC(observations, probabilities, title="", labels=True, size='auto'):
    """
    Create a ROC plot from observations (y_test) and probabilities (y_pred_proba).

    title  -- title of the plot
    size   -- tuple, size in inches; defaults to 'auto'
    labels -- toggle display of the title, axis labels and tick labels
    """
    if size == 'auto':
        fig = plt.figure()
    else:
        fig = plt.figure(num=None, figsize=size, dpi=80)
    axes = fig.add_axes([0, 0, 1, 1])
    fpr, tpr, thresholds = roc_curve(observations, probabilities)
    axes.plot(fpr, tpr)
    axes.plot([0, 1], [0, 1], 'k--')  # diagonal: performance of a random classifier
    axes.set_aspect('equal')
    if labels:
        axes.set_title(title)
        axes.set_xlabel('False Positive Rate')
        axes.set_ylabel('True Positive Rate')
    else:
        axes.get_xaxis().set_ticks([])
        axes.get_yaxis().set_ticks([])
    return fig
Plot distribution of probabilities
In [14]:
plot_hist(y_test, y_pred_proba.T[1], 'probability', size=(3,3));
Plot ROC curve
In [15]:
plot_ROC(y_test, y_pred_proba.T[1], size=(3,3));
In [16]:
print("AUC: %.3f" % roc_auc_score(y_test, y_pred_proba.T[1]))
Calculate confusion matrix
 | Predicted Positive | Predicted Negative |
---|---|---|
Actual Positive | TP | FN |
Actual Negative | FP | TN |

Note that scikit-learn's confusion_matrix orders both the rows (actual) and the columns (predicted) by label, here [0, 1], so the matrix printed below reads [[TN, FP], [FN, TP]].
In [17]:
confusion_matrix(y_test, y_pred)
Out[17]:
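For binary labels [0, 1], confusion_matrix(...).ravel() unpacks to (tn, fp, fn, tp), which makes it easy to compute sensitivity and specificity directly and cross-check the recall_score calls below; a small sketch:
# Derive sensitivity (recall for class 1) and specificity (recall for class 0)
# from the confusion matrix entries.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
sensitivity, specificity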
In [18]:
recall_score(y_test, y_pred, pos_label=1) # Low-moderate sensitivity
Out[18]:
In [19]:
recall_score(y_test, y_pred, pos_label=0) # High specificity
Out[19]: